The data consists of vegetation % cover by functional group from across CONUS (from AIM, FIA, LANDFIRE, and RAP), as well as climate variables from DayMet, which have been aggregated into mean interannual conditions accross multiple temporal windows.
User defined parameters
print(params)
## $test_run
## [1] FALSE
##
## $save_figs
## [1] FALSE
##
## $hmod
## [1] FALSE
##
## $sample_group
## [1] 1
##
## $response
## [1] "C4GramCover_dec"
##
## $s
## [1] "C4GramCover"
# set to true if want to run for a limited number of rows (i.e. for code testing)
test_run <- params$test_run
save_figs <- params$save_figs
hmod <- params$hmod # whether to include human modification in the model
# by changing the sample_group the model can be fit to a completely different set of rows
sample_group <- params$sample_group
response <- params$response
# _ann defines this model as being built on annual data
s <- params$s # string to paste to file names e.g., that defines the interactions in the model
# such as (summer ppt * MAT) and annuals*temperature interactions
fit_sample <- TRUE # fit model to a sample of the data
n_train <- 5e4 # sample size of the training data
n_test <- 1e6 # sample size of the testing data (if this is too big the decile dotplot code throws memory errors)
# set option so resampled dataset created here reproduces earlier runs of this code with dplyr 1.0.10
source("../../../Functions/glmTransformsIterates.R")
source("../../../Functions/transformPreds.R")
source("../../../Functions/StepBeta_mine.R")
#source("src/fig_params.R")
#source("src/modeling_functions.R")
library(caret)
library(betareg)
library(tidyverse)
library(GGally) # for ggpairs()
library(pdp) # for partial dependence plots
library(gridExtra)
library(knitr)
library(patchwork) # for figure insets etc.
library(ggtext)
library(StepBeta)
theme_set(theme_classic())
Data compiled in the prepDataForModels.R script
modDat <- readRDS("../../../data/DataForModels.rds")
set.seed(1)
modDat_1 <- modDat
# small dataset for if testing the data
if(test_run) {
modDat_1 <- slice_sample(modDat_1, n = 1e5)
}
For now, not doing any resampling
set.seed(1234)
pred_vars <- c("swe_meanAnnAvg_30yr", "tmean_meanAnnAvg_30yr", "prcp_meanAnnTotal_30yr", "PrecipTempCorr_meanAnnAvg_30yr", "isothermality_meanAnnAvg_30yr", "annWetDegDays_meanAnnAvg_30yr")
# removed VPD, since it's highly correlated w/ tmean and prcp
names(pred_vars) <- pred_vars
# predictor vars are the same in both dfs
## remove rows for data that have NAs in the predictors (lag starts before range of DayMet data)
modDat_1 <- modDat_1 %>%
filter(!is.na(swe_meanAnnAvg_30yr) &
!is.na(tmin_meanAnnAvg_30yr) &
!is.na(prcp_meanAnnTotal_30yr) &
!is.na(precip_Seasonality_meanAnnAvg_30yr) &
!is.na(PrecipTempCorr_meanAnnAvg_30yr) &
!is.na(paste(response)) )
df_pred <- modDat_1[, pred_vars]
Training data
df_sample <- if(fit_sample & !test_run) {
reordered <- slice_sample(modDat_1, prop = 1)
low <- (sample_group - 1)*n_train + 1 # first row (of reordered data) to get
high <- low + n_train - 1 # last row
if(high > nrow(modDat_1)) {
stop('trying to sample from rows that dont exist')
}
reordered[low:high, ]
} else {
modDat_1
}
df_test <- if(fit_sample & !test_run &
# antijoin only works if there are enough rows that meet
# that criterion, i.e. if df_sample contains most of the data i
# doesnt' work
(nrow(modDat_1) - nrow(df_sample) > n_test)) {
modDat_1 %>%
anti_join(df_sample, by = c("cell_num", "year")) %>%
slice_sample(n = n_test)
} else {
modDat_1 %>%
slice_sample(n = n_test)
}
# small sample for certain plots
df_small <- slice_sample(modDat_1, n = 1e5)
create_summary <- function(df) {
df %>%
pivot_longer(cols = everything(),
names_to = 'variable') %>%
group_by(variable) %>%
summarise(across(value, .fns = list(mean = ~mean(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE),
median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE)))) %>%
mutate(across(where(is.numeric), round, 4))
}
modDat_1[pred_vars] %>%
create_summary() %>%
knitr::kable(caption = 'summaries of predictor variables')
| variable | value_mean | value_min | value_median | value_max |
|---|---|---|---|---|
| PrecipTempCorr_meanAnnAvg_30yr | -0.1333 | -0.8556 | -0.1200 | 0.7145 |
| annWetDegDays_meanAnnAvg_30yr | 1656.2857 | 208.9903 | 1657.7056 | 2713.2516 |
| isothermality_meanAnnAvg_30yr | -37.1426 | -61.3018 | -36.8628 | -20.2171 |
| prcp_meanAnnTotal_30yr | 760.5631 | 52.1693 | 538.0903 | 4116.7860 |
| swe_meanAnnAvg_30yr | 16.1970 | 0.0000 | 2.7609 | 1471.5825 |
| tmean_meanAnnAvg_30yr | 6.8020 | 1.7479 | 6.8581 | 10.3473 |
response_summary <- modDat_1 %>%
dplyr::select(#where(is.numeric), -all_of(pred_vars),
matches(response)) %>%
create_summary()
kable(response_summary,
caption = 'summaries of response variables, calculated using paint')
| variable | value_mean | value_min | value_median | value_max |
|---|---|---|---|---|
| C4GramCover_dec | 0.0898 | 1e-04 | 1e-04 | 0.999 |
here using pred dataframe, because smaller and this code is slow.
df_pred %>%
slice_sample(n = 5e4) %>%
#select(-matches("_")) %>%
ggpairs(lower = list(continuous = GGally::wrap("points", alpha = 0.1, size=0.2)),
columnLabels = c("swe", "tmean", "prcp", "prcpTempCorr", "isothermality", "wetDegDays"))
# vectors of names of response variables
vars_response <- response
# longformat dataframes for making boxplots
df_sample_plots <- df_sample %>%
rename(response = all_of(response)) %>%
mutate(response = case_when(
response <= .25 ~ ".25",
response > .25 & response <=.5 ~ ".5",
response > .5 & response <=.75 ~ ".75",
response >= .75 ~ "1",
)) %>%
select(c(response, pred_vars)) %>%
pivot_longer(cols = names(pred_vars),
names_to = "predictor",
values_to = "value") %>%
filter(!is.na(response) & !is.na(value))
# df_biome_long <-
# # for some reason select() was giving me problems
# # adding numYrs here so can take weighted average
# predvars2long(modDat_1, response_vars = c(vars_response),
# pred_vars = pred_vars) #%>%
# # mutate(across(all_of(vars_nfire), factor))
ggplot(df_sample_plots, aes_string(x= "response", y = 'value')) +
geom_boxplot() +
facet_wrap(~predictor, scales = 'free_y') +
ylab("Predictor Variable Values")
Creating formulas where each variable on its own is transformed numerous ways (including formula to fit spline), all other variables are left alone, that repeated for each variable. So have models with 1 variable transformed, 2 transformed, etc.
see documentation for glms_iterate_transforms, in the modelling_functions.R script
set.seed(1234)
# adding an interaction term to help deal with over-predicting fire probability
# at pixels with high afgAGB and high MAP. parentheses around interaction
# term are included so that glms_iterate_transforms doesn't transform
# the interaction term.
pred_vars_inter <- c(pred_vars, params$inter)
# pred_vars_inter <- pred_vars
mods_glm_cwf1 <- glmTransformsIterates(
preds = pred_vars_inter,
df = df_sample,
response_var = response,
delta_aic = 10)
## [1] "step1"
## Error in chol.default(K) : the leading minor of order 8 is not positive
## Error in chol.default(K) : the leading minor of order 2 is not positive
## Error in chol.default(K) : the leading minor of order 9 is not positive
## Error in chol.default(K) : the leading minor of order 9 is not positive
## Error in chol.default(K) : the leading minor of order 9 is not positive
##
## -117910.6
## -118097.2
## delta aic cutoff 10
## [1] "step2"
## Error in chol.default(K) : the leading minor of order 9 is not positive
## Error in chol.default(K) : the leading minor of order 9 is not positive
## Error in chol.default(K) : the leading minor of order 10 is not positive
## Error in chol.default(K) : the leading minor of order 10 is not positive
## Error in chol.default(K) : the leading minor of order 10 is not positive
##
## -118097.2
## -118245.4
## delta aic cutoff 10
## [1] "step3"
## Error in chol.default(K) : the leading minor of order 10 is not positive
## Error in chol.default(K) : the leading minor of order 11 is not positive
## Error in chol.default(K) : the leading minor of order 11 is not positive
##
## -118245.4
## -118311.7
## delta aic cutoff 10
## [1] "step4"
## Error in chol.default(K) : the leading minor of order 2 is not positive
## Error in chol.default(K) : the leading minor of order 12 is not positive
## Error in chol.default(K) : the leading minor of order 12 is not positive
##
## -118311.7
## -118394
## delta aic cutoff 10
## [1] "step5"
## Error in chol.default(K) : the leading minor of order 13 is not positive
## Error in chol.default(K) : the leading minor of order 13 is not positive
## Error in chol.default(K) : the leading minor of order 2 is not positive
##
## -118394
## -118409.2
## delta aic cutoff 10
## [1] "step6"
## Error in chol.default(K) : the leading minor of order 2 is not positive
##
## -118409.2
## -118420.4
## delta aic cutoff 10
# not including the last element of the list which is final_formula
mods_glm_cwf2 <- mods_glm_cwf1[-length(mods_glm_cwf1)]
map_dfc(mods_glm_cwf2, ~ names(.$aic[1:5])) %>%
kable(caption = "5 best transformations at each step")
| step1 | step2 | step3 | step4 | step5 | step6 |
|---|---|---|---|---|---|
| convert_poly2_isothermality_meanAnnAvg_30yr | convert_poly2sqrt_annWetDegDays_meanAnnAvg_30yr | convert_poly2_tmean_meanAnnAvg_30yr | convert_poly2log10_swe_meanAnnAvg_30yr | convert_poly2log10_prcp_meanAnnTotal_30yr | convert_poly2_PrecipTempCorr_meanAnnAvg_30yr |
| convert_poly2_annWetDegDays_meanAnnAvg_30yr | convert_poly2log10_prcp_meanAnnTotal_30yr | convert_poly2log10_swe_meanAnnAvg_30yr | convert_poly2sqrt_swe_meanAnnAvg_30yr | convert_log10_prcp_meanAnnTotal_30yr | convert_none |
| convert_poly2_tmean_meanAnnAvg_30yr | convert_poly2log10_annWetDegDays_meanAnnAvg_30yr | convert_poly2sqrt_tmean_meanAnnAvg_30yr | convert_log10_swe_meanAnnAvg_30yr | convert_sqrt_prcp_meanAnnTotal_30yr | convert_log10_PrecipTempCorr_meanAnnAvg_30yr |
| convert_poly2sqrt_annWetDegDays_meanAnnAvg_30yr | convert_log10_prcp_meanAnnTotal_30yr | convert_poly2log10_tmean_meanAnnAvg_30yr | convert_poly2sqrt_prcp_meanAnnTotal_30yr | convert_none | convert_poly2sqrt_PrecipTempCorr_meanAnnAvg_30yr |
| convert_poly2sqrt_tmean_meanAnnAvg_30yr | convert_poly2sqrt_prcp_meanAnnTotal_30yr | convert_poly2sqrt_swe_meanAnnAvg_30yr | convert_sqrt_swe_meanAnnAvg_30yr | convert_poly2_prcp_meanAnnTotal_30yr | convert_sqrt_PrecipTempCorr_meanAnnAvg_30yr |
best_aic_by_step <- map_dbl(mods_glm_cwf2, ~.$aic[1])
best_aic_by_step <- c(step0 = mods_glm_cwf1$step1$aic[['convert_none']],
best_aic_by_step)
# AIC improvement by step
diff(best_aic_by_step)
## step1 step2 step3 step4 step5 step6
## -186.62629 -148.19024 -66.27251 -82.32028 -15.24013 -11.20081
plot(y = best_aic_by_step,
x = 1:length(best_aic_by_step) - 1,
ylab = "AIC",
xlab = "Number of variables transformed")
Best transformation each step.
var_transformed <- map(mods_glm_cwf2, function(x) x$var_transformed)
var_transformed
## $step1
## [1] "stats::poly(isothermality_meanAnnAvg_30yr,2,raw=TRUE)"
##
## $step2
## [1] "stats::poly(I(sqrt(annWetDegDays_meanAnnAvg_30yr)),2,raw=TRUE)"
##
## $step3
## [1] "stats::poly(tmean_meanAnnAvg_30yr,2,raw=TRUE)"
##
## $step4
## [1] "stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)"
##
## $step5
## [1] "stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)"
##
## $step6
## [1] "stats::poly(PrecipTempCorr_meanAnnAvg_30yr,2,raw=TRUE)"
Plot potential interaction terms using raw data
across values of swe
# calculate quantiles
swe_quants <- quantile(df_sample$swe_meanAnnAvg_30yr, na.rm = TRUE)
# longformat dataframe for making boxplots
df_sample_swe <- df_sample %>%
select(c(response, pred_vars)) %>%
# transform the predictors according to above process
mutate(isotherm_squared = as.numeric(stats::poly(isothermality_meanAnnAvg_30yr,2,raw=TRUE)[,2]),
wetDegDays_squared = as.numeric(stats::poly(annWetDegDays_meanAnnAvg_30yr,2,raw=TRUE)[,2]),
swe_log = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1]),
swe_log_squared = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2]),
tmean_log = as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1] ),
tmean_log_squared = as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2] ),
prcp_log = as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,1]),
prcp_log_squared = as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,2]),
prcpTempCorr_squared = as.numeric(stats::poly(PrecipTempCorr_meanAnnAvg_30yr,2,raw=TRUE)[,2])
) %>%
rename(response = all_of(response)) %>%
# mutate(response = case_when(
# response <= .25 ~ ".25",
# response > .25 & response <=.5 ~ ".5",
# response > .5 & response <=.75 ~ ".75",
# response >= .75 ~ "1",
# )) %>%
# pivot_longer(cols = 2:16,
# names_to = "predictor",
# values_to = "value") %>%
# filter(!is.na(response) & !is.na(value))
mutate(swe_meanAnnAvg_30yr_quants = case_when(
swe_meanAnnAvg_30yr <= swe_quants[2] ~ swe_quants[2],
swe_meanAnnAvg_30yr > swe_quants[2] & swe_meanAnnAvg_30yr <= swe_quants[3] ~ swe_quants[3],
swe_meanAnnAvg_30yr > swe_quants[3] & swe_meanAnnAvg_30yr <= swe_quants[4] ~ swe_quants[4],
swe_meanAnnAvg_30yr >= swe_quants[4] ~ swe_quants[5],
)) %>%
pivot_longer(cols = 2:16,
names_to = "predictor",
values_to = "value") %>%
filter(!is.na(response) & !is.na(value))
# plot relationships amongst variables according to different levels of SWE
ggplot(data = df_sample_swe[!(df_sample_swe$predictor %in% c("swe_log", "swe_log_squared", "swe_meanAnnAvg_30yr")),]) +
geom_point(aes(y = response, x = value, col = as.factor(swe_meanAnnAvg_30yr_quants)), alpha = .5) +
geom_smooth(aes(y = response, x = value, col = as.factor(swe_meanAnnAvg_30yr_quants)), method = "glm") +
facet_wrap(~predictor, scales = "free_x")
Mean Annual Temperature
tmean_quants <- quantile(df_sample$tmean_meanAnnAvg_30yr, na.rm = TRUE)
# longformat dataframe for making boxplots
df_sample_tmean <- df_sample %>%
select(c(response, pred_vars)) %>%
# transform the predictors according to above process
mutate(isotherm_squared = as.numeric(stats::poly(isothermality_meanAnnAvg_30yr,2,raw=TRUE)[,2]),
wetDegDays_squared = as.numeric(stats::poly(annWetDegDays_meanAnnAvg_30yr,2,raw=TRUE)[,2]),
swe_log = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1]),
swe_log_squared = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2]),
tmean_log = as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1] ),
tmean_log_squared = as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2] ),
prcp_log = as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,1]),
prcp_log_squared = as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,2]),
prcpTempCorr_squared = as.numeric(stats::poly(PrecipTempCorr_meanAnnAvg_30yr,2,raw=TRUE)[,2])
) %>%
rename(response = all_of(response)) %>%
# mutate(response = case_when(
# response <= .25 ~ ".25",
# response > .25 & response <=.5 ~ ".5",
# response > .5 & response <=.75 ~ ".75",
# response >= .75 ~ "1",
# )) %>%
# pivot_longer(cols = 2:16,
# names_to = "predictor",
# values_to = "value") %>%
# filter(!is.na(response) & !is.na(value))
mutate(tmean_meanAnnAvg_30yr_quants = case_when(
tmean_meanAnnAvg_30yr <= tmean_quants[2] ~ tmean_quants[2],
tmean_meanAnnAvg_30yr > tmean_quants[2] & tmean_meanAnnAvg_30yr <= tmean_quants[3] ~ tmean_quants[3],
tmean_meanAnnAvg_30yr > tmean_quants[3] & tmean_meanAnnAvg_30yr <= tmean_quants[4] ~ tmean_quants[4],
tmean_meanAnnAvg_30yr >= tmean_quants[4] ~ tmean_quants[5],
)) %>%
pivot_longer(cols = 2:16,
names_to = "predictor",
values_to = "value") %>%
filter(!is.na(response) & !is.na(value))
# plot relationships amongst variables according to different levels of tmean
ggplot(data = df_sample_tmean[!(df_sample_tmean$predictor %in% c("tmean_log", "tmean_log_squared", "tmean_meanAnnAvg_30yr")),]) +
geom_point(aes(y = response, x = value, col = as.factor(tmean_meanAnnAvg_30yr_quants)), alpha = .3) +
geom_smooth(aes(y = response, x = value, col = as.factor(tmean_meanAnnAvg_30yr_quants)), method = "glm") +
facet_wrap(~predictor, scales = "free_x")
Mean annual precipitation
prcp_quants <- quantile(df_sample$prcp_meanAnnTotal_30yr, na.rm = TRUE)
# longformat dataframe for making boxplots
df_sample_prcp <- df_sample %>%
select(c(response, pred_vars)) %>%
# transform the predictors according to above process
mutate(isotherm_squared = as.numeric(stats::poly(isothermality_meanAnnAvg_30yr,2,raw=TRUE)[,2]),
wetDegDays_squared = as.numeric(stats::poly(annWetDegDays_meanAnnAvg_30yr,2,raw=TRUE)[,2]),
swe_log = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1]),
swe_log_squared = as.numeric(stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2]),
tmean_log = as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,1] ),
tmean_log_squared = as.numeric(stats::poly(I(log10(I(tmean_meanAnnAvg_30yr+1))),2,raw=TRUE)[,2] ),
prcp_log = as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,1]),
prcp_log_squared = as.numeric(stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE)[,2]),
prcpTempCorr_squared = as.numeric(stats::poly(PrecipTempCorr_meanAnnAvg_30yr,2,raw=TRUE)[,2])
) %>%
rename(response = all_of(response)) %>%
# mutate(response = case_when(
# response <= .25 ~ ".25",
# response > .25 & response <=.5 ~ ".5",
# response > .5 & response <=.75 ~ ".75",
# response >= .75 ~ "1",
# )) %>%
# pivot_longer(cols = 2:16,
# names_to = "predictor",
# values_to = "value") %>%
# filter(!is.na(response) & !is.na(value))
mutate(prcp_meanAnnTotal_30yr_quants = case_when(
prcp_meanAnnTotal_30yr <= prcp_quants[2] ~ prcp_quants[2],
prcp_meanAnnTotal_30yr > prcp_quants[2] & prcp_meanAnnTotal_30yr <= prcp_quants[3] ~ prcp_quants[3],
prcp_meanAnnTotal_30yr > prcp_quants[3] & prcp_meanAnnTotal_30yr <= prcp_quants[4] ~ prcp_quants[4],
prcp_meanAnnTotal_30yr >= prcp_quants[4] ~ prcp_quants[5],
)) %>%
pivot_longer(cols = 2:16,
names_to = "predictor",
values_to = "value") %>%
filter(!is.na(response) & !is.na(value))
# plot relationships amongst variables according to different levels of tmean
ggplot(data = df_sample_prcp[!(df_sample_prcp$predictor %in% c("prcp_log", "prcp_log_squared", "prcp_meanAnnTotal_30yr")),]) +
geom_point(aes(y = response, x = value, col = as.factor(prcp_meanAnnTotal_30yr_quants)), alpha = .3) +
geom_smooth(aes(y = response, x = value, col = as.factor(prcp_meanAnnTotal_30yr_quants)), method = "glm") +
facet_wrap(~predictor, scales = "free_x")
best_form <- mods_glm_cwf1$final_formula
print(best_form)
## convert_poly2_PrecipTempCorr_meanAnnAvg_30yr
## "C4GramCover_dec ~ stats::poly(I(log10(I(swe_meanAnnAvg_30yr+1))),2,raw=TRUE) + stats::poly(tmean_meanAnnAvg_30yr,2,raw=TRUE) + stats::poly(I(log10(I(prcp_meanAnnTotal_30yr+1))),2,raw=TRUE) + stats::poly(PrecipTempCorr_meanAnnAvg_30yr,2, raw = TRUE) + stats::poly(isothermality_meanAnnAvg_30yr,2,raw=TRUE) + stats::poly(I(sqrt(annWetDegDays_meanAnnAvg_30yr)),2,raw=TRUE)"
# formula w/ interactions included
modWithInteractions <- paste(best_form, c("+ I(log10(I(swe_meanAnnAvg_30yr+1))):isothermality_meanAnnAvg_30yr + I(log10(I(swe_meanAnnAvg_30yr+1))):I(log10(I(tmean_meanAnnAvg_30yr+1))) + I(log10(I(tmean_meanAnnAvg_30yr+1))):I(log10(I(prcp_meanAnnTotal_30yr+1))) + I(log10(I(prcp_meanAnnTotal_30yr+1))):I(log10(I(swe_meanAnnAvg_30yr+1)))")) %>%
str_remove_all("stats::")
# refitting the glm with the best formula
mod_glm1 <- betareg(as.formula(modWithInteractions), data = df_sample, link = c("logit"), link.phi = NULL,
type = c("ML"))
Using the “StepBeta” function from the StepBeta R package
modSelection <- StepBeta_mine(mod_glm1, data = df_sample)
## [1] "100 % of the process"
## Error in chol.default(K) : the leading minor of order 6 is not positive
## Error in chol.default(K) : the leading minor of order 8 is not positive
## Error in chol.default(K) : the leading minor of order 8 is not positive
## Error in chol.default(K) : the leading minor of order 13 is not positive
## Error in chol.default(K) : the leading minor of order 13 is not positive
Response variables are the proportion of years in which fires occurred. Using best model formula selected in the previous step
best_form <- modSelection$formula
print(best_form)
## C4GramCover_dec ~ poly(PrecipTempCorr_meanAnnAvg_30yr, 2, raw = TRUE) +
## poly(I(log10(I(swe_meanAnnAvg_30yr + 1))), 2, raw = TRUE) +
## poly(I(sqrt(annWetDegDays_meanAnnAvg_30yr)), 2, raw = TRUE) +
## poly(tmean_meanAnnAvg_30yr, 2, raw = TRUE) + poly(isothermality_meanAnnAvg_30yr,
## 2, raw = TRUE)
## <environment: 0x39469c958>
# refitting the glm with the best formula
mod_glmFinal <- betareg(as.formula(best_form), data = df_sample, link = c("logit"), link.phi = NULL,
type = c("ML"))
# fit using glmmTMB for subsequent analyses (Same output)
#mod_glmFinal_glmmTMB <- glmmTMB::glmmTMB(as.formula(best_form), data = df_sample, family=beta_family(link="logit"))
# should be the same AIC (i.e. refitting the same model)
AIC(mod_glmFinal)
## [1] -118423.1
#summary(mod_glmFinal_glmmTMB)
PDP plot trend made using a small sample of the data
#vip::vip(mod_glmFinal, num_features = 15)
#pdp_all_vars(mod_glmFinal, mod_vars = pred_vars, ylab = 'probability',train = df_small)
#caret::varImp(mod_glmFinal)
Predicting on the data
# create prediction for each each model
# (i.e. for each fire proporation variable)
predict_by_response <- function(mod, df) {
df_out <- df
response_name <- paste0(response, "_pred")
df_out[[response_name]] <- predict(mod, df, type = 'response')
df_out
}
pred_glm1 <- predict_by_response(mod_glmFinal, df_test)
Binning predictor variables into deciles (actually percentiles) and looking at the mean predicted probability for each percentile. The use of the word deciles is just a legacy thing (they started out being actual deciles)
Then predicting on an identical dataset but with warming
var_prop_pred <- paste0(response, "_pred")
response_vars <- c(response, var_prop_pred)
pred_glm1_deciles <- predvars2deciles(pred_glm1,
response_vars = response_vars,
pred_vars = pred_vars)
Publication quality quantile plot
# publication quality version
g <- decile_dotplot_pq(pred_glm1_deciles, response = response)
if(!hmod) {
# obs/pred inset
g2 <- add_dotplot_inset(g, pred_glm1_deciles)
} else {
g2 <- g
}
g2
if(save_figs) {
png(paste0("figures/quantile_plots/quantile_plot_v5", s, ".png"),
units = "in", res = 600, width = 5.5, height = 3.5 )
print(g2)
dev.off()
}
20th and 80th percentiles for each climate variable
df <- pred_glm1[, pred_vars] #%>%
#mutate(MAT = MAT - 273.15) # k to c
map(df, quantile, probs = c(0.2, 0.8), na.rm = TRUE)
## $swe_meanAnnAvg_30yr
## 20% 80%
## 0.03122831 16.79101381
##
## $tmean_meanAnnAvg_30yr
## 20% 80%
## 5.852257 7.806215
##
## $prcp_meanAnnTotal_30yr
## 20% 80%
## 324.5293 1261.6810
##
## $PrecipTempCorr_meanAnnAvg_30yr
## 20% 80%
## -0.5890414 0.2725700
##
## $isothermality_meanAnnAvg_30yr
## 20% 80%
## -41.37623 -33.05065
##
## $annWetDegDays_meanAnnAvg_30yr
## 20% 80%
## 1345.267 2034.182
Filtered ‘Decile’ plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower two deciles of each climate variable.
clim_vars <- c("swe_meanAnnAvg_30yr", "tmean_meanAnnAvg_30yr", "prcp_meanAnnTotal_30yr", "precip_Seasonality_meanAnnAvg_30yr", "isothermality_meanAnnAvg_30yr", "annWetDegDays_meanAnnAvg_30yr")
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1,
response_vars = response_vars,
pred_vars = pred_vars,
filter_var = TRUE,
filter_vars = pred_vars)
decile_dotplot_filtered_pq(pred_glm1_deciles_filt, xvars = clim_vars)
#decile_dotplot_filtered_pq(pred_glm1_deciles_filt)
Filtered quantile figure with middle 2 deciles also shown (this is very memory intensive so no running at the moment)
pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1,
response_vars = response_vars,
pred_vars = pred_vars,
filter_vars = pred_vars,
filter_var = TRUE,
add_mid = TRUE)
g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, xvars = pred_vars)
g
if(save_figs) {
jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", s, ".jpeg"),
units = "in", res = 600, width = 5.5, height = 6 )
g
dev.off()
}
# glm models
mods2save <- butcher::butcher(mod_glmFinal) # removes some model components so the saved object isn't huge
mods2save$formula <- best_form
mods2save$pred_vars_inter <- pred_vars_inter # so have interactions
n <- nrow(df_sample)
mods2save$data_rows <- n
if(!test_run) {
saveRDS(mods2save,
paste0("./models/glm_beta_model_FirstPass_", s, "_n", n,
#sample_group,
".RDS"))
}
Hash of current commit (i.e. to ID the version of the code used)
system("git rev-parse HEAD", intern=TRUE)
## [1] "12f42926c3fdbe0f575a7492a3aee5378726ee9f"
Packages etc.
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.4.1
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Denver
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] caret_6.0-94 lattice_0.22-6 ggtext_0.1.2 knitr_1.46 gridExtra_2.3 pdp_0.8.1
## [7] GGally_2.2.1 dtplyr_1.3.1 patchwork_1.2.0 lmtest_0.9-40 zoo_1.8-12 StepBeta_2.1.0
## [13] sf_1.0-16 corrplot_0.92 glmmTMB_1.1.9 betareg_3.1-4 statmod_1.5.0 terra_1.7-78
## [19] lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [25] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 jsonlite_1.8.8 rstudioapi_0.16.0 magrittr_2.0.3
## [5] modeltools_0.2-23 farver_2.1.2 nloptr_2.0.3 rmarkdown_2.27
## [9] vctrs_0.6.5 minqa_1.2.7 butcher_0.3.4 htmltools_0.5.8.1
## [13] progress_1.2.3 Formula_1.2-5 pROC_1.18.5 sass_0.4.9
## [17] parallelly_1.37.1 bslib_0.7.0 KernSmooth_2.23-22 plyr_1.8.9
## [21] sandwich_3.1-0 cachem_1.1.0 TMB_1.9.12 commonmark_1.9.1
## [25] lifecycle_1.0.4 iterators_1.0.14 pkgconfig_2.0.3 Matrix_1.7-0
## [29] R6_2.5.1 fastmap_1.2.0 future_1.33.2 digest_0.6.35
## [33] numDeriv_2016.8-1.1 colorspace_2.1-0 pkgload_1.3.4 labeling_0.4.3
## [37] fansi_1.0.6 timechange_0.3.0 abind_1.4-5 mgcv_1.9-1
## [41] compiler_4.4.0 proxy_0.4-27 aod_1.3.3 withr_3.0.0
## [45] carData_3.0-5 DBI_1.2.3 ggstats_0.6.0 highr_0.10
## [49] MASS_7.3-60.2 lava_1.8.0 classInt_0.4-10 ModelMetrics_1.2.2.2
## [53] tools_4.4.0 units_0.8-5 future.apply_1.11.2 nnet_7.3-19
## [57] glue_1.7.0 nlme_3.1-164 gridtext_0.1.5 grid_4.4.0
## [61] reshape2_1.4.4 generics_0.1.3 recipes_1.1.0 gtable_0.3.5
## [65] tzdb_0.4.0 class_7.3-22 data.table_1.15.4 hms_1.1.3
## [69] xml2_1.3.6 car_3.1-2 utf8_1.2.4 flexmix_2.3-19
## [73] foreach_1.5.2 pillar_1.9.0 markdown_1.13 splines_4.4.0
## [77] survival_3.5-8 tidyselect_1.2.1 stats4_4.4.0 xfun_0.44
## [81] hardhat_1.4.0 timeDate_4032.109 stringi_1.8.4 yaml_2.3.8
## [85] boot_1.3-30 evaluate_0.23 codetools_0.2-20 vip_0.4.1
## [89] cli_3.6.2 rpart_4.1.23 munsell_0.5.1 jquerylib_0.1.4
## [93] Rcpp_1.0.12 globals_0.16.3 parallel_4.4.0 gower_1.0.1
## [97] prettyunits_1.2.0 lme4_1.1-35.3 listenv_0.9.1 ipred_0.9-15
## [101] scales_1.3.0 prodlim_2024.06.25 e1071_1.7-14 crayon_1.5.2
## [105] combinat_0.0-8 rlang_1.1.4 cowplot_1.1.3